{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Pandas通用设置"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-05-12T07:56:10.282407Z",
     "start_time": "2020-05-12T07:56:10.183767Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'1.0.3'"
      ]
     },
     "execution_count": 1,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import numpy as np\n",
    "import pandas as pd\n",
    "pd.__version__"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-05-12T07:56:10.285700Z",
     "start_time": "2020-05-12T07:56:10.283472Z"
    }
   },
   "outputs": [],
   "source": [
    "# 设置\n",
    "import pandas as pd\n",
    "\n",
    "display_settings = {\n",
    "    'max_columns': 10,\n",
    "    'expand_frame_repr': True,  # Wrap to multiple pages\n",
    "    'max_rows': 10,\n",
    "    'precision': 2,\n",
    "    'show_dimensions': True\n",
    "}\n",
    "\n",
    "for op, value in display_settings.items():\n",
    "    pd.set_option(\"display.{}\".format(op), value)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Pandas对象\n",
    "\n",
    "* 三种基本 Pandas 数据结构:\n",
    "**Series, DataFrame, Index**"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-10-29T11:36:44.960819Z",
     "start_time": "2018-10-29T11:36:44.945165Z"
    },
    "heading_collapsed": true
   },
   "source": [
    "## Series对象"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-05-12T07:56:10.289986Z",
     "start_time": "2020-05-12T07:56:10.286859Z"
    },
    "hidden": true
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0    0.25\n",
       "1    0.50\n",
       "2    0.75\n",
       "3    1.00\n",
       "Length: 4, dtype: float64"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 基于数组\n",
    "data = pd.Series([0.25, 0.5, 0.75, 1.0])\n",
    "data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-05-12T07:56:10.293001Z",
     "start_time": "2020-05-12T07:56:10.290963Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "values: [0.25 0.5  0.75 1.  ] ，index: RangeIndex(start=0, stop=4, step=1)\n"
     ]
    }
   ],
   "source": [
    "# 对象的值与索引：\n",
    "\n",
    "print('values:', data.values, '，index:', data.index)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-05-12T07:56:10.296454Z",
     "start_time": "2020-05-12T07:56:10.293780Z"
    },
    "hidden": true
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "1    0.50\n",
       "2    0.75\n",
       "Length: 2, dtype: float64"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 通过索引访问数据：\n",
    "data[1:3]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "hidden": true
   },
   "source": [
    "### Series 作为一般的 NumPy 数组\n",
    "- Numpy 数组默认的整数索引\n",
    "- Series 还可以 显式 定义索引：字符串列表、整数列表(可以为非连续数列)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-05-12T07:56:10.300265Z",
     "start_time": "2020-05-12T07:56:10.297273Z"
    },
    "hidden": true
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "a    0.25\n",
       "b    0.50\n",
       "c    0.75\n",
       "d    1.00\n",
       "Length: 4, dtype: float64"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data = pd.Series([0.25, 0.5, 0.75, 1.0], index=['a', 'b', 'c', 'd'])\n",
    "data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-05-12T07:56:10.303259Z",
     "start_time": "2020-05-12T07:56:10.301123Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.5"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data['b']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-05-12T07:56:10.306993Z",
     "start_time": "2020-05-12T07:56:10.303998Z"
    },
    "hidden": true
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "2    0.25\n",
       "5    0.50\n",
       "3    0.75\n",
       "7    1.00\n",
       "Length: 4, dtype: float64"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 使用非连续的索引\n",
    "data = pd.Series([0.25, 0.5, 0.75, 1.0], index=[2, 5, 3, 7])\n",
    "data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-05-12T07:56:10.310409Z",
     "start_time": "2020-05-12T07:56:10.308232Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.5"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data[5]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "hidden": true
   },
   "source": [
    "### Series 作为特殊的词典 "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-05-12T07:56:10.314916Z",
     "start_time": "2020-05-12T07:56:10.311306Z"
    },
    "code_folding": [],
    "hidden": true
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "California    38332521\n",
       "Texas         26448193\n",
       "New York      19651127\n",
       "Florida       19552860\n",
       "Illinois      12882135\n",
       "Length: 5, dtype: int64"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "population_dict = {\n",
    "    'California': 38332521,\n",
    "    'Texas': 26448193,\n",
    "    'New York': 19651127,\n",
    "    'Florida': 19552860,\n",
    "    'Illinois': 12882135\n",
    "}\n",
    "population = pd.Series(population_dict)\n",
    "population"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    ">**注意**：索引为显式定义的字符串，切片时<span class=\"burk\">包含终止索引点</span>的！！！！"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-05-12T07:56:10.319047Z",
     "start_time": "2020-05-12T07:56:10.315819Z"
    },
    "hidden": true
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Texas       26448193\n",
       "New York    19651127\n",
       "Florida     19552860\n",
       "Length: 3, dtype: int64"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "population['Texas':'Florida']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-05-12T07:56:10.322342Z",
     "start_time": "2020-05-12T07:56:10.319823Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "California    38332521\n",
       "Texas         26448193\n",
       "Length: 2, dtype: int64"
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "population[:2]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "hidden": true
   },
   "source": [
    "### 创建 Series 对象\n",
    "\n",
    "`pd.Series(data, index=index)`\n",
    "- 参数`data`可以为列表，Numpy数组，标量或词典\n",
    "- 若定义了 `index`，则只有 `index` 对应的项才激活"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-05-12T07:56:10.325838Z",
     "start_time": "2020-05-12T07:56:10.323026Z"
    },
    "hidden": true
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0    2\n",
       "1    4\n",
       "2    6\n",
       "Length: 3, dtype: int64"
      ]
     },
     "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "a = pd.Series([2, 4, 6])\n",
    "a"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-05-12T07:56:10.329888Z",
     "start_time": "2020-05-12T07:56:10.326475Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "100    5\n",
       "200    5\n",
       "300    5\n",
       "Length: 3, dtype: int64"
      ]
     },
     "execution_count": 14,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "b = pd.Series(5, index=[100, 200, 300])\n",
    "b"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-05-12T07:56:10.333788Z",
     "start_time": "2020-05-12T07:56:10.330585Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "2    a\n",
       "1    b\n",
       "3    c\n",
       "Length: 3, dtype: object"
      ]
     },
     "execution_count": 15,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "c = pd.Series({2: 'a', 1: 'b', 3: 'c'})\n",
    "c"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-05-12T07:56:10.337799Z",
     "start_time": "2020-05-12T07:56:10.334630Z"
    },
    "hidden": true
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "3    c\n",
       "2    a\n",
       "Length: 2, dtype: object"
      ]
     },
     "execution_count": 16,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 索引 使用优先 指定的 index 属性\n",
    "pd.Series({2: 'a', 1: 'b', 3: 'c'}, index=[3, 2])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-10-29T11:36:07.011004Z",
     "start_time": "2018-10-29T11:36:06.995607Z"
    },
    "heading_collapsed": true
   },
   "source": [
    "## DataFrame对象"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "heading_collapsed": true,
    "hidden": true
   },
   "source": [
    "### DataFrame 作为一般的 NumPy 数组"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-05-12T07:56:10.344637Z",
     "start_time": "2020-05-12T07:56:10.338539Z"
    },
    "hidden": true
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>population</th>\n",
       "      <th>area</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>California</th>\n",
       "      <td>38332521</td>\n",
       "      <td>423967</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Texas</th>\n",
       "      <td>26448193</td>\n",
       "      <td>695662</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>New York</th>\n",
       "      <td>19651127</td>\n",
       "      <td>141297</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Florida</th>\n",
       "      <td>19552860</td>\n",
       "      <td>170312</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Illinois</th>\n",
       "      <td>12882135</td>\n",
       "      <td>149995</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>5 rows × 2 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "            population    area\n",
       "California    38332521  423967\n",
       "Texas         26448193  695662\n",
       "New York      19651127  141297\n",
       "Florida       19552860  170312\n",
       "Illinois      12882135  149995\n",
       "\n",
       "[5 rows x 2 columns]"
      ]
     },
     "execution_count": 17,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "population_dict = {\n",
    "    'California': 38332521,\n",
    "    'Texas': 26448193,\n",
    "    'New York': 19651127,\n",
    "    'Florida': 19552860,\n",
    "    'Illinois': 12882135\n",
    "}\n",
    "population = pd.Series(population_dict)\n",
    "\n",
    "area_dict = {\n",
    "    'California': 423967,\n",
    "    'Texas': 695662,\n",
    "    'New York': 141297,\n",
    "    'Florida': 170312,\n",
    "    'Illinois': 149995\n",
    "}\n",
    "area = pd.Series(area_dict)\n",
    "\n",
    "states = pd.DataFrame({'population': population, 'area': area})\n",
    "\n",
    "states"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-05-12T07:56:10.347570Z",
     "start_time": "2020-05-12T07:56:10.345457Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Index(['California', 'Texas', 'New York', 'Florida', 'Illinois'], dtype='object')"
      ]
     },
     "execution_count": 18,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 索引\n",
    "states.index"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-05-12T07:56:10.350215Z",
     "start_time": "2020-05-12T07:56:10.348234Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Index(['population', 'area'], dtype='object')"
      ]
     },
     "execution_count": 19,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 列名\n",
    "states.columns"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-05-12T07:56:10.352963Z",
     "start_time": "2020-05-12T07:56:10.351025Z"
    },
    "hidden": true
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([[38332521,   423967],\n",
       "       [26448193,   695662],\n",
       "       [19651127,   141297],\n",
       "       [19552860,   170312],\n",
       "       [12882135,   149995]])"
      ]
     },
     "execution_count": 20,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 值\n",
    "states.values"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "heading_collapsed": true,
    "hidden": true
   },
   "source": [
    "### DataFrame 作为特殊的词典"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-05-12T07:56:10.355872Z",
     "start_time": "2020-05-12T07:56:10.353663Z"
    },
    "hidden": true
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "California    423967\n",
       "Texas         695662\n",
       "New York      141297\n",
       "Florida       170312\n",
       "Illinois      149995\n",
       "Name: area, Length: 5, dtype: int64"
      ]
     },
     "execution_count": 21,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 列名-该列的值，等价于字典的 键值对\n",
    "states['area']"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "heading_collapsed": true,
    "hidden": true
   },
   "source": [
    "### 创建 DataFrame 对象 \n",
    "- 通过 Series 对象\n",
    "- 通过词典组成的列表，每个词典的键位列名，每个词典代表一行\n",
    "- 通过值为 Series 对象的词典，词典的键为列名\n",
    "- 二维的 Numpy 数组"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-05-12T07:56:10.360049Z",
     "start_time": "2020-05-12T07:56:10.356558Z"
    },
    "code_folding": [],
    "hidden": true
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>population</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>California</th>\n",
       "      <td>38332521</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Texas</th>\n",
       "      <td>26448193</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>New York</th>\n",
       "      <td>19651127</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Florida</th>\n",
       "      <td>19552860</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Illinois</th>\n",
       "      <td>12882135</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>5 rows × 1 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "            population\n",
       "California    38332521\n",
       "Texas         26448193\n",
       "New York      19651127\n",
       "Florida       19552860\n",
       "Illinois      12882135\n",
       "\n",
       "[5 rows x 1 columns]"
      ]
     },
     "execution_count": 22,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 通过 Series 对象\n",
    "pd.DataFrame(population, columns=['population'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-05-12T07:56:10.365039Z",
     "start_time": "2020-05-12T07:56:10.360790Z"
    },
    "code_folding": [],
    "hidden": true
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>a</th>\n",
       "      <th>b</th>\n",
       "      <th>c</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>5</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>2</td>\n",
       "      <td>4</td>\n",
       "      <td>10</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>3</td>\n",
       "      <td>9</td>\n",
       "      <td>15</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>4 rows × 3 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "   a  b   c\n",
       "0  0  0   0\n",
       "1  1  1   5\n",
       "2  2  4  10\n",
       "3  3  9  15\n",
       "\n",
       "[4 rows x 3 columns]"
      ]
     },
     "execution_count": 23,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 通过 字典 组成的列表\n",
    "data = [{'a': i, 'b': i**2, 'c': 5 * i} for i in range(4)]\n",
    "pd.DataFrame(data)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-05-12T07:56:10.370111Z",
     "start_time": "2020-05-12T07:56:10.365704Z"
    },
    "hidden": true
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>a</th>\n",
       "      <th>b</th>\n",
       "      <th>c</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1.0</td>\n",
       "      <td>2</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>NaN</td>\n",
       "      <td>3</td>\n",
       "      <td>4.0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>2 rows × 3 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "     a  b    c\n",
       "0  1.0  2  NaN\n",
       "1  NaN  3  4.0\n",
       "\n",
       "[2 rows x 3 columns]"
      ]
     },
     "execution_count": 24,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 通过 字典 组成的列表，缺失的用 NaN 填充\n",
    "pd.DataFrame([{'a': 1, 'b': 2}, {'b': 3, 'c': 4}])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-05-12T07:56:10.374823Z",
     "start_time": "2020-05-12T07:56:10.370904Z"
    },
    "code_folding": [],
    "hidden": true
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>population</th>\n",
       "      <th>area</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>California</th>\n",
       "      <td>38332521</td>\n",
       "      <td>423967</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Texas</th>\n",
       "      <td>26448193</td>\n",
       "      <td>695662</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>New York</th>\n",
       "      <td>19651127</td>\n",
       "      <td>141297</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Florida</th>\n",
       "      <td>19552860</td>\n",
       "      <td>170312</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Illinois</th>\n",
       "      <td>12882135</td>\n",
       "      <td>149995</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>5 rows × 2 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "            population    area\n",
       "California    38332521  423967\n",
       "Texas         26448193  695662\n",
       "New York      19651127  141297\n",
       "Florida       19552860  170312\n",
       "Illinois      12882135  149995\n",
       "\n",
       "[5 rows x 2 columns]"
      ]
     },
     "execution_count": 25,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "pd.DataFrame({'population': population, 'area': area})"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-05-12T07:56:10.379755Z",
     "start_time": "2020-05-12T07:56:10.375526Z"
    },
    "hidden": true
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>foo</th>\n",
       "      <th>bar</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>a</th>\n",
       "      <td>0.74</td>\n",
       "      <td>0.55</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>b</th>\n",
       "      <td>0.40</td>\n",
       "      <td>0.38</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>c</th>\n",
       "      <td>0.21</td>\n",
       "      <td>0.39</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>3 rows × 2 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "    foo   bar\n",
       "a  0.74  0.55\n",
       "b  0.40  0.38\n",
       "c  0.21  0.39\n",
       "\n",
       "[3 rows x 2 columns]"
      ]
     },
     "execution_count": 26,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 通过二维 NumPy 数组\n",
    "\n",
    "pd.DataFrame(np.random.rand(3, 2),\n",
    "             columns=['foo', 'bar'],\n",
    "             index=['a', 'b', 'c'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-05-12T07:56:10.383536Z",
     "start_time": "2020-05-12T07:56:10.381176Z"
    },
    "hidden": true
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([(0, 0.), (0, 0.), (0, 0.)], dtype=[('A', '<i8'), ('B', '<f8')])"
      ]
     },
     "execution_count": 27,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# From a NumPy structured array\n",
    "\n",
    "A = np.zeros(3, dtype=[('A', 'i8'), ('B', 'f8')])\n",
    "A"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-05-12T07:56:10.388627Z",
     "start_time": "2020-05-12T07:56:10.384327Z"
    },
    "hidden": true
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>A</th>\n",
       "      <th>B</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>0</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>0</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>0</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>3 rows × 2 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "   A    B\n",
       "0  0  0.0\n",
       "1  0  0.0\n",
       "2  0  0.0\n",
       "\n",
       "[3 rows x 2 columns]"
      ]
     },
     "execution_count": 28,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "pd.DataFrame(A)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-10-29T11:58:26.520123Z",
     "start_time": "2018-10-29T11:58:26.516132Z"
    },
    "heading_collapsed": true
   },
   "source": [
    "## Index对象"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-05-12T07:56:10.391716Z",
     "start_time": "2020-05-12T07:56:10.389356Z"
    },
    "hidden": true
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Int64Index([2, 3, 5, 7, 11], dtype='int64')"
      ]
     },
     "execution_count": 29,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "ind = pd.Index([2, 3, 5, 7, 11])\n",
    "ind"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "hidden": true
   },
   "source": [
    "### Index 作为不可变数组"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-05-12T07:56:10.394650Z",
     "start_time": "2020-05-12T07:56:10.392400Z"
    },
    "hidden": true
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(3, Int64Index([2, 5, 11], dtype='int64'))"
      ]
     },
     "execution_count": 30,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "ind[1], ind[::2]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-05-12T07:56:10.397455Z",
     "start_time": "2020-05-12T07:56:10.395332Z"
    },
    "hidden": true
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(5, (5,), 1, dtype('int64'))"
      ]
     },
     "execution_count": 31,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "ind.size, ind.shape, ind.ndim, ind.dtype"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-05-12T07:56:10.433866Z",
     "start_time": "2020-05-12T07:56:10.398143Z"
    },
    "hidden": true
   },
   "outputs": [
    {
     "ename": "TypeError",
     "evalue": "Index does not support mutable operations",
     "output_type": "error",
     "traceback": [
      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[0;31mTypeError\u001b[0m                                 Traceback (most recent call last)",
      "\u001b[0;32m<ipython-input-32-b8e208cdb545>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[1;32m      1\u001b[0m \u001b[0;31m# 不可变对象\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 2\u001b[0;31m \u001b[0mind\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;36m0\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
      "\u001b[0;32m~/anaconda3/lib/python3.7/site-packages/pandas/core/indexes/base.py\u001b[0m in \u001b[0;36m__setitem__\u001b[0;34m(self, key, value)\u001b[0m\n\u001b[1;32m   3907\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m   3908\u001b[0m     \u001b[0;32mdef\u001b[0m \u001b[0m__setitem__\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mkey\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mvalue\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 3909\u001b[0;31m         \u001b[0;32mraise\u001b[0m \u001b[0mTypeError\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"Index does not support mutable operations\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m   3910\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m   3911\u001b[0m     \u001b[0;32mdef\u001b[0m \u001b[0m__getitem__\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mkey\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;31mTypeError\u001b[0m: Index does not support mutable operations"
     ]
    }
   ],
   "source": [
    "# 不可变对象\n",
    "ind[1] = 0"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "heading_collapsed": true,
    "hidden": true
   },
   "source": [
    "### Index 作为有序集合对象\n",
    "可以进行一系列集合操作\n",
    "- unions(并集), intersections(交集), differences(差)等"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-05-12T07:56:16.840257Z",
     "start_time": "2020-05-12T07:56:16.834661Z"
    },
    "hidden": true
   },
   "outputs": [],
   "source": [
    "indA = pd.Index([1, 3, 5, 7, 9])\n",
    "indB = pd.Index([2, 3, 5, 7, 11])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 34,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-05-12T07:56:17.659629Z",
     "start_time": "2020-05-12T07:56:17.653546Z"
    },
    "hidden": true
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Int64Index([3, 5, 7], dtype='int64')"
      ]
     },
     "execution_count": 34,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "indA & indB  # 交集"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-05-12T07:56:18.272636Z",
     "start_time": "2020-05-12T07:56:18.266473Z"
    },
    "hidden": true
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Int64Index([1, 2, 3, 5, 7, 9, 11], dtype='int64')"
      ]
     },
     "execution_count": 35,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "indA | indB  # 并集"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 36,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-05-12T07:56:18.784617Z",
     "start_time": "2020-05-12T07:56:18.779041Z"
    },
    "hidden": true
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Int64Index([1, 2, 9, 11], dtype='int64')"
      ]
     },
     "execution_count": 36,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "indA ^ indB  # 异或"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 37,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-05-12T07:56:19.399633Z",
     "start_time": "2020-05-12T07:56:19.394182Z"
    },
    "hidden": true
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Int64Index([3, 5, 7], dtype='int64')"
      ]
     },
     "execution_count": 37,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 或者使用对象方法\n",
    "indA.intersection(indB)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 遍历Pandas数据"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 38,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-05-12T07:56:31.899181Z",
     "start_time": "2020-05-12T07:56:31.886930Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>A</th>\n",
       "      <th>B</th>\n",
       "      <th>C</th>\n",
       "      <th>D</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>7</td>\n",
       "      <td>1</td>\n",
       "      <td>3</td>\n",
       "      <td>5</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>0</td>\n",
       "      <td>8</td>\n",
       "      <td>7</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>9</td>\n",
       "      <td>7</td>\n",
       "      <td>8</td>\n",
       "      <td>5</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>2</td>\n",
       "      <td>2</td>\n",
       "      <td>1</td>\n",
       "      <td>5</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>8</td>\n",
       "      <td>6</td>\n",
       "      <td>5</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>5 rows × 4 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "   A  B  C  D\n",
       "0  7  1  3  5\n",
       "1  0  8  7  0\n",
       "2  9  7  8  5\n",
       "3  2  2  1  5\n",
       "4  8  6  5  0\n",
       "\n",
       "[5 rows x 4 columns]"
      ]
     },
     "execution_count": 38,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import timeit\n",
    "df = pd.DataFrame(np.random.randint(0, 10, size=(100000, 4)),\n",
    "                  columns=list('ABCD'))\n",
    "df.head(5)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 39,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-05-12T07:56:47.992089Z",
     "start_time": "2020-05-12T07:56:39.261326Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "1.09 s ± 2.88 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n"
     ]
    }
   ],
   "source": [
    "# for 循环\n",
    "def loop_with_for(df):\n",
    "    temp = 0\n",
    "    for index in range(len(df)):\n",
    "        temp += df['A'].iloc[index] + df['B'].iloc[index]\n",
    "    return temp\n",
    "\n",
    "\n",
    "%timeit loop_with_for(df)\n",
    "# %prun -l 4 loop_with_for(df)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 40,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-05-12T07:57:37.983415Z",
     "start_time": "2020-05-12T07:56:50.322941Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "5.95 s ± 8.44 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n"
     ]
    }
   ],
   "source": [
    "# pd的iterrows方法\n",
    "def loop_with_iterrows(df):\n",
    "    temp = 0\n",
    "    for _, row in df.iterrows():\n",
    "        temp += row.A + row.B\n",
    "    return temp\n",
    "\n",
    "\n",
    "%timeit loop_with_iterrows(df)\n",
    "# %prun -l 4 loop_with_iterrows(df)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 41,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-05-12T07:57:41.959464Z",
     "start_time": "2020-05-12T07:57:37.991856Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "49 ms ± 219 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)\n"
     ]
    }
   ],
   "source": [
    "# pd的itertuples函数\n",
    "def loop_with_itertuples(df):\n",
    "    temp = 0\n",
    "    for row_tuple in df.itertuples():\n",
    "        temp += row_tuple.A + row_tuple.B\n",
    "    return temp\n",
    "\n",
    "\n",
    "%timeit loop_with_itertuples(df)\n",
    "# %prun -l 4 loop_with_itertuples(df)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 42,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-05-12T07:57:50.532699Z",
     "start_time": "2020-05-12T07:57:41.967727Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "10.6 ms ± 2.75 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)\n"
     ]
    }
   ],
   "source": [
    "# zip函数\n",
    "def loop_with_zip(df):\n",
    "    temp = 0\n",
    "    for a, b in zip(df['A'], df['B']):\n",
    "        temp += a + b\n",
    "    return temp\n",
    "\n",
    "\n",
    "%timeit loop_with_zip(df)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 43,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-05-12T07:58:02.081024Z",
     "start_time": "2020-05-12T07:57:50.540882Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "1.44 s ± 11.6 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n"
     ]
    }
   ],
   "source": [
    "# pd的apply函数\n",
    "def using_apply(df):\n",
    "    return df.apply(lambda x: x['A'] + x['B'], axis=1).sum()\n",
    "\n",
    "\n",
    "%timeit using_apply(df)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 44,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-05-12T07:58:11.589999Z",
     "start_time": "2020-05-12T07:58:02.089400Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "117 µs ± 79.5 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)\n"
     ]
    }
   ],
   "source": [
    "# numpy的内置函数\n",
    "def using_numpy_builtin(df):\n",
    "    return (df['A'].values + df['B'].values).sum()\n",
    "\n",
    "\n",
    "%timeit using_numpy_builtin(df)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 处理时间"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 46,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-05-12T07:58:37.853310Z",
     "start_time": "2020-05-12T07:58:37.839739Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "   Day  Month  Year  data\n",
      "0   10      8  2018  1.80\n",
      "4   11      8  2018  2.29\n",
      "8   12      8  2018  1.78\n",
      "2   10      9  2018  1.00\n",
      "6   11      9  2018 -0.95\n",
      "\n",
      "[5 rows x 4 columns]\n",
      "        date  data\n",
      "0 2018-08-10  1.80\n",
      "4 2018-08-11  2.29\n",
      "8 2018-08-12  1.78\n",
      "2 2018-09-10  1.00\n",
      "6 2018-09-11 -0.95\n",
      "\n",
      "[5 rows x 2 columns]\n"
     ]
    }
   ],
   "source": [
    "# 处理时间\n",
    "from itertools import product\n",
    "col_names = [\"Day\", \"Month\", \"Year\"]\n",
    "df = pd.DataFrame(list(product([10, 11, 12], [8, 9], [2018, 2019])),\n",
    "                  columns=col_names)\n",
    "\n",
    "df['data'] = np.random.randn(len(df))\n",
    "\n",
    "df = df.sort_values(['Year', 'Month'], ascending=[True, True])\n",
    "print(df.head())\n",
    "\n",
    "df.insert(loc=0, column=\"date\", value=pd.to_datetime(df[col_names]))\n",
    "df = df.drop(col_names, axis=1).squeeze()\n",
    "print(df.head())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.7"
  },
  "toc": {
   "base_numbering": 1,
   "nav_menu": {},
   "number_sections": true,
   "sideBar": true,
   "skip_h1_title": false,
   "title_cell": "Table of Contents",
   "title_sidebar": "Contents",
   "toc_cell": false,
   "toc_position": {
    "height": "calc(100% - 180px)",
    "left": "10px",
    "top": "150px",
    "width": "410px"
   },
   "toc_section_display": true,
   "toc_window_display": true
  },
  "varInspector": {
   "cols": {
    "lenName": 16,
    "lenType": 16,
    "lenVar": 40
   },
   "kernels_config": {
    "python": {
     "delete_cmd_postfix": "",
     "delete_cmd_prefix": "del ",
     "library": "var_list.py",
     "varRefreshCmd": "print(var_dic_list())"
    },
    "r": {
     "delete_cmd_postfix": ") ",
     "delete_cmd_prefix": "rm(",
     "library": "var_list.r",
     "varRefreshCmd": "cat(var_dic_list()) "
    }
   },
   "types_to_exclude": [
    "module",
    "function",
    "builtin_function_or_method",
    "instance",
    "_Feature"
   ],
   "window_display": false
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
